#!/bin/bash
#
# clusterByLsa.sh - assign each document to the class of its largest feature
# and then list the documents that belong to each class.
#
# Usage:
#   clusterByLsa.sh src_file dstFile_dId_cId dstFile_cId_dIds
#
# Arguments:
#   src_file          input; field 1 is a document id, field 2 is a
#                     comma-separated list of numeric feature values
#   dstFile_dId_cId   output; one "documentId classId maxFeature" line per
#                     document
#   dstFile_cId_dIds  output; one "classId docId,docId,..." line per class
#
# Exit status: 2 when fewer than three arguments are given.

# Abort on the first failing step so that a failed first pass does not
# silently produce an empty (but "successful") second output file.
set -euo pipefail

if (( $# < 3 )); then
    # Diagnostics belong on stderr, and a usage error must not report success.
    echo "usage: clusterByLsa.sh src_file dstFile_dId_cId  dstFile_cId_dIds" >&2
    exit 2
fi

srcFile=$1
dstFile1=$2
dstFile2=$3
#######################################
# For every document line, find the largest feature value and report which
# 1-based position (class) it occupies.
# Arguments: $1 - input file; field 1 is the document id, field 2 is a
#                 comma-separated list of numeric feature values
# Outputs:   one "documentId classId maxFeature" line per document on
#            stdout; a warning on stderr for lines without any feature value
# Notes:     - blank lines are ignored
#            - empty slots in the list (e.g. a trailing comma) are skipped
#            - an exact tie between features 1 and 2 selects class 2, any
#              later tie keeps the earlier class (historical behaviour)
#######################################
assign_top_class() {
    awk '
    NF == 0 { next }    # ignore blank lines instead of emitting a bogus record

    {
        docId = $1
        featureCount = split($2, feature, ",")

        bestClass = 0   # 0 means: no usable feature seen yet
        for (i = 1; i <= featureCount; ++i) {
            if (length(feature[i]) == 0) continue

            # Force a numeric comparison; a missing second feature must never
            # be treated as an implicit 0 that beats a negative value.
            value = feature[i] + 0

            if (bestClass == 0 || value > bestValue || (i == 2 && value == bestValue)) {
                bestClass = i
                bestValue = value
            }
        }

        if (bestClass == 0) {
            printf("warning: line %d: no feature values, skipped\n", NR) > "/dev/stderr"
            next
        }

        printf("%d %d %f\n", docId, bestClass, bestValue)
    }' "$1"
}

assign_top_class "$srcFile" > "$dstFile1"

#######################################
# Collect the document ids that share a cluster id.
# Arguments: $1 - input file; field 1 is a document id, field 2 is the
#                 cluster id assigned to it
# Outputs:   one "clusterId docId,docId,..." line per cluster on stdout,
#            clusters in order of first appearance, document ids in input
#            order
# Notes:     lines with fewer than two fields are ignored
#######################################
group_docs_by_cluster() {
    awk '
    NF >= 2 {
        if ($2 in members) {
            members[$2] = members[$2] "," $1
        } else {
            # Remember when each cluster was first seen: iterating with
            # "for (k in array)" yields an implementation-defined order.
            firstSeen[++clusterCount] = $2
            members[$2] = $2 " " $1
        }
    }
    END {
        for (i = 1; i <= clusterCount; ++i) {
            print members[firstSeen[i]]
        }
    }' "$1"
}

group_docs_by_cluster "$dstFile1" > "$dstFile2"
